rm(list = ls())
gc(verbose = FALSE)

args = commandArgs(trailingOnly = "TRUE")
args

arg1 <- args[1]
arg2 <- args[2]

# Define library folder
setwd(arg1)
.libPaths(arg2)
.libPaths(.libPaths()[1])

pkg <- c("nnls", "SuperLearner", "dplyr", "arm", "onehot", "grf", "xgboost", "caret", "data.table",
         "knitr", "kableExtra", "magrittr", "kimisc", "data.table", "readstata13", "DescTools", "scales", "ggplot2")
lapply(pkg, library, character.only = T, lib.loc = arg2)


# read the data
ihwap_complete = read.csv(paste0(arg1, "/Data/ihwap_full.csv"))

# drop some variables not to be used
ihwap_full = subset(ihwap_complete, select = -c(Household, ExistingConsumption, ProjectedConsumption, end_day, kfold, kfold2, rowid))
ihwap_completeX = subset(ihwap_complete, select = -c(Household, ExistingConsumption, ProjectedConsumption, end_day, kfold, kfold2, rowid, total_mmbtu, treated))

# will train model only with untreated homes
ihwap_full = ihwap_full[which(ihwap_full$treated==0), ]

### Data Frames for the models
# dependent variable
train_y = as.matrix(subset(ihwap_full, select = c("total_mmbtu")))
# independent variables
train_x = subset(ihwap_full, select = -c(total_mmbtu, treated))

######## Setup SuperLearner
# Function to create a list of model names
expandingList <- function(capacity = 10) {
    buffer <- vector('list', capacity)
    length <- 0
    
    methods <- list()
    
    methods$double.size <- function() {
        buffer <<- c(buffer, vector('list', capacity))
        capacity <<- capacity * 2
    }
    
    methods$add <- function(val) {
        if(length == capacity) {
            methods$double.size()
        }
        
        length <<- length + 1
        buffer[[length]] <<- val
    }
    
    methods$as.list <- function() {
        b <- buffer[0:length]
        return(b)
    }
    
    methods
}
SL.library<-expandingList()


### different models to try
# random forest
randomForest.tune <- list(ntree = c(1000, 2000),
			nodesize = c(30),
			maxnodes = c(500, 1000))

# Set detailed names = T so we can see the configuration for each function.
# Also shorten the name prefix.
randomForest <- create.Learner("SL.randomForest", tune = randomForest.tune, detailed_names = T, name_prefix = "randomForest")
for(i in 1:length(randomForest$names)){
    SL.library$add(c(randomForest$names[i]))
}

fit.gaselec.SL = SuperLearner(Y=train_y, X=train_x, 
                            SL.library=SL.library$as.list(), family=gaussian(),
                            method="method.NNLS", verbose=TRUE, cvControl = list(V = 5))
saveRDS(fit.gaselec.SL, paste0(arg1, "/Results/Model_Outputs/predictpre_model_rf.rds"))

# predictions for pre-data
in_preds = as.data.frame(fit.gaselec.SL$library.predict)
cv_preds = as.data.frame(fit.gaselec.SL$Z)
colnames(cv_preds) =  sprintf('cv_%s', colnames(in_preds)) 
colnames(in_preds) =  sprintf('in_%s', colnames(in_preds))

data_export = subset(ihwap_complete[which(ihwap_complete$treated==0), ], select = c(Household, end_day, end_month, end_year))
data_export = as.data.frame(cbind(data_export, in_preds, cv_preds))

saveRDS(data_export, paste0(arg1, "/Results/Model_Outputs/predictpre_results_rf.rds"))
write.csv(data_export, paste0(arg1, "/Results/Model_Outputs/predictpre_results_rf.csv"), row.names = FALSE)

# predictions for post-data
predictions = predict(fit.gaselec.SL, ihwap_completeX, onlySL = T)
data_export2 = subset(ihwap_complete, select = c(Household, end_day, end_month, end_year))
data_export2$pred_rf = unlist(as.numeric(predictions$pred))

saveRDS(data_export2, paste0(arg1, "/Results/Model_Outputs/predictpost_results_rf.rds"))
write.csv(data_export2, paste0(arg1, "/Results/Model_Outputs/predictpost_results_rf.csv"), row.names = FALSE)
